Pokemon data analysis with PCA

The files are available at https://github.com/v0369012/Pokemon_PCA.

# Load the raw Pokemon value list; readLines() returns one string per line
PKM_values_7 <- readLines(con = "Pokemon_list_g7.txt", encoding = "UTF-8")

To simplify the analysis, we removed Pokemon with special forms, such as Mega and Alolan forms, among others.

# Loading packages
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Pokemon without a special form: keep only the records that contain exactly
# eight "|" separators
PKM_without_spf <- str_count(string = PKM_values_7, pattern = "\\|") == 8
PKM_values_7_without_spf <- PKM_values_7[PKM_without_spf]
# Break every retained record into its "|"-separated fields
PKM_values_7_without_spf_split <- PKM_values_7_without_spf %>%
  str_split(pattern = "\\|")
# Make a Pokemon table
# Row index (within the filtered records) of the last Pokemon of generations
# 1 to 7, and the resulting number of rows per generation.
# NOTE(review): these boundaries are carried over unchanged from the original
# analysis; they are only valid for the 792-record input it was written for.
PKM_generation_last_row <- c(151, 251, 385, 490, 640, 707, 792)
PKM_generation_size <- diff(c(0, PKM_generation_last_row))

# Fail loudly instead of silently truncating or NA-padding the table when the
# input no longer matches the hard-coded generation boundaries.
PKM_n_records <- length(PKM_values_7_without_spf_split)
stopifnot(all(lengths(PKM_values_7_without_spf_split) == 9L))
if (PKM_n_records != sum(PKM_generation_size)) {
  stop(
    "Expected ", sum(PKM_generation_size), " Pokemon records but found ",
    PKM_n_records, "; the generation boundaries need updating.",
    call. = FALSE
  )
}

# Stack the nine fields of every record into a character matrix (one row per
# Pokemon) so each column is extracted once. Field 1 is not used; fields 2-9
# are the number, the name and the six base stats.
PKM_values_7_fields <- do.call(rbind, PKM_values_7_without_spf_split)

PKM_values_7_without_spf_df <- data.frame(
  Number = PKM_values_7_fields[, 2],
  Name = PKM_values_7_fields[, 3],
  generation = rep(
    as.numeric(seq_along(PKM_generation_size)),
    times = PKM_generation_size
  ),
  HP = as.numeric(PKM_values_7_fields[, 4]),
  ATK = as.numeric(PKM_values_7_fields[, 5]),
  DEF = as.numeric(PKM_values_7_fields[, 6]),
  SATK = as.numeric(PKM_values_7_fields[, 7]),
  SDEF = as.numeric(PKM_values_7_fields[, 8]),
  # The last field still carries the record's closing "}" characters
  SPEED = as.numeric(str_remove_all(PKM_values_7_fields[, 9], "\\}"))
)

# Check the table
head(PKM_values_7_without_spf_df)
##   Number               Name generation HP ATK DEF SATK SDEF SPEED
## 1    001           妙蛙种子          1 45  49  49   65   65    45
## 2    002             妙蛙草          1 60  62  63   80   80    60
## 3    003             妙蛙花          1 80  82  83  100  100    80
## 4    004       小火<U+9F99>          1 39  52  43   60   50    65
## 5    005       火恐<U+9F99>          1 58  64  58   80   65    80
## 6    006 <U+55B7>火<U+9F99>          1 78  84  78  109   85   100
# Address types table
PKM_types_7 <- readLines("Pokemon_types.txt")

# Only the first 876 lines are used, exactly as in the original loops.
# NOTE(review): 876 is carried over unchanged; confirm whether it is meant to
# equal length(PKM_types_7).
PKM_types_7_n_records <- 876
stopifnot(length(PKM_types_7) >= PKM_types_7_n_records)

# Split the records once, rather than re-splitting the whole file on every
# loop iteration, then pull the wanted field out of each record. `[[` errors
# on a record with too few fields instead of returning NA.
PKM_types_7_split <- str_split(
  PKM_types_7[seq_len(PKM_types_7_n_records)],
  "\\|"
)

# Field 3 holds the Pokemon number
PKM_types_7_number <- vapply(PKM_types_7_split, `[[`, character(1), 3)

# Remove Pokemon number containing letters
position_without_letters <- str_detect(PKM_types_7_number, "^[0-9]*$")
PKM_types_7_number_without_letters <- PKM_types_7_number[position_without_letters]

# Field 4 holds the name
PKM_types_7_name <- vapply(PKM_types_7_split, `[[`, character(1), 4)

# Field 6 holds the first type, followed by the record's closing "}" characters
PKM_types_7_types1 <- vapply(PKM_types_7_split, `[[`, character(1), 6) %>%
  str_remove_all("\\}")

# Assemble the first-type table: one row per record, holding its number,
# name and first type
PKM_types_7_columns <- list(
  Number = PKM_types_7_number,
  Name = PKM_types_7_name,
  types1 = PKM_types_7_types1
)
PKM_types_7_df <- do.call(data.frame, PKM_types_7_columns)

# Preview the first rows of the table
head(PKM_types_7_df, n = 6L)
##   Number       Name types1
## 1    001  Bulbasaur  Grass
## 2    002    Ivysaur  Grass
## 3    003   Venusaur  Grass
## 4    004 Charmander   Fire
## 5    005 Charmeleon   Fire
## 6    006  Charizard   Fire
# Keep only the types rows whose number is purely numeric
PKM_types_7_df_t <- PKM_types_7_df %>%
  filter(Number %in% PKM_types_7_number_without_letters)
# Join the types table and the values table on their shared Number column
PKM_merged_df <- merge(
  x = PKM_types_7_df_t,
  y = PKM_values_7_without_spf_df,
  by = "Number"
)
# Drop the fourth column (the Chinese names) and relabel the second column,
# the name kept from the types table, as plain "Name"
PKM_merged_df <- PKM_merged_df[, -4]
names(PKM_merged_df)[2] <- "Name"

# Check the correlation between the variables
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
# Pairwise correlations of columns 5 onwards, in long form (first rows only)
PKM_merged_df[, 5:ncol(PKM_merged_df)] %>%
  cor() %>%
  melt() %>%
  head()
##    Var1 Var2     value
## 1    HP   HP 1.0000000
## 2   ATK   HP 0.4355205
## 3   DEF   HP 0.2326511
## 4  SATK   HP 0.3789334
## 5  SDEF   HP 0.3630897
## 6 SPEED   HP 0.1579874
# Draw the correlation matrix as a heatmap
# Long-form correlation table: one row per (Var1, Var2) pair
PKM_cor_long <- melt(cor(PKM_merged_df[, 5:ncol(PKM_merged_df)]))
ggplot(PKM_cor_long, aes(x = Var1, y = Var2)) +
  geom_tile(aes(fill = value), colour = "white") +
  # Diverging palette centred on zero correlation
  scale_fill_gradient2(
    low = "firebrick4",
    mid = "white",
    high = "steelblue",
    midpoint = 0
  ) +
  guides(fill = guide_legend(title = "Correlation")) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)
  )

# PCA
# Run the PCA on columns 5 onwards of the merged table. The second positional
# argument of prcomp() is `retx`, so it is named here and written as TRUE
# (`T` is an ordinary variable that can be reassigned).
# NOTE(review): the variables are centered but NOT scaled (`scale.` keeps its
# default of FALSE). If the bare `T` was meant to request scaling, that needs
# `scale. = TRUE`, which would change the results reported below.
pca.model <- prcomp(PKM_merged_df[, 5:ncol(PKM_merged_df)], retx = TRUE)

# Show pca summary
summary(pca.model)
## Importance of components:
##                            PC1     PC2     PC3    PC4      PC5     PC6
## Standard deviation     45.3401 30.5620 26.2863 22.890 18.60068 14.5243
## Proportion of Variance  0.4317  0.1962  0.1451  0.110  0.07266  0.0443
## Cumulative Proportion   0.4317  0.6279  0.7730  0.883  0.95570  1.0000
# Show the rotation matrix (one row per variable, one column per component)
pca.model[["rotation"]]
##             PC1         PC2         PC3         PC4        PC5        PC6
## HP    0.3659171  0.01407945 -0.12663530 -0.79582535  0.3333136 -0.3247060
## ATK   0.4641571 -0.09316829 -0.71487140  0.05861969 -0.1662638  0.4834841
## DEF   0.4194666 -0.64774595  0.04716237  0.36935249 -0.0344900 -0.5144296
## SATK  0.4636844  0.39081599  0.33978416 -0.08918610 -0.7063606 -0.0995336
## SDEF  0.4208148 -0.12777920  0.57761282  0.03901981  0.3998128  0.5581917
## SPEED 0.2874409  0.63441466 -0.14699745  0.46618427  0.4486350 -0.2732882
# Make a pca table to plot
# Scores on the first two principal components, one row per Pokemon,
# with the Pokemon number as row name
p1_p2_table <- pca.model$x[, 1:2] %>% as.data.frame()
rownames(p1_p2_table) <- PKM_merged_df[, 1]

# Pokemon numbers that are flagged as "legend" in the plots
legend_number <- c(
  144:146, 150:151,
  243:245, 249:251,
  377:386,
  479:494,
  638:649,
  716:721,
  785:809
)

# TRUE for every row whose Number is listed in legend_number.
# NOTE(review): Number holds zero-padded text such as "001", while
# legend_number is numeric, so the match is done on the character form; this
# works only because every listed number has three digits.
legend <- PKM_merged_df[, "Number"] %in% legend_number
legend_position <- which(legend)

p1_p2_table_t <- cbind(
  p1_p2_table,
  Number = PKM_merged_df[, "Number"],
  Name = PKM_merged_df[, "Name"],
  generation = PKM_merged_df[, "generation"],
  types1 = PKM_merged_df[, "types1"],
  legend = legend
)

# Store generation as text so that it is treated as a discrete variable
# when mapped to colour
p1_p2_table_t[, "generation"] <- as.character(p1_p2_table_t[, "generation"])

# Static scatter plots of PC1 against PC2
# Uncoloured version
pca_gg <- ggplot(p1_p2_table_t, aes(PC1, PC2, label = Number)) +
  geom_point(size = 2.5)
# Points coloured by generation
pca_gg_generation <- ggplot(
  p1_p2_table_t,
  aes(PC1, PC2, label = Number, colour = generation)
) +
  geom_point(size = 2.5)
# Points coloured by first type
pca_gg_types1 <- ggplot(
  p1_p2_table_t,
  aes(PC1, PC2, label = Number, colour = types1)
) +
  geom_point(size = 2.5)
# Points coloured by the legend flag
pca_gg_legend <- ggplot(
  p1_p2_table_t,
  aes(PC1, PC2, label = Number, colour = legend)
) +
  geom_point(size = 2.5)

# User-interactive visualization
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Convert each static plot into an interactive plotly widget and display it
pca_ggly <- pca_gg %>% ggplotly()
pca_ggly
# Interactive plot coloured by generation
pca_ggly_generation <- pca_gg_generation %>% ggplotly()
pca_ggly_generation
# Interactive plot coloured by first type
pca_ggly_types1 <- pca_gg_types1 %>% ggplotly()
pca_ggly_types1
# Interactive plot coloured by the legend flag
pca_ggly_legend <- pca_gg_legend %>% ggplotly()
pca_ggly_legend